In [3]:
pip install pandas numpy scikit-learn matplotlib seaborn
Requirement already satisfied: pandas in c:\users\lenovo\anaconda3\lib\site-packages (2.2.2)
Requirement already satisfied: numpy in c:\users\lenovo\anaconda3\lib\site-packages (1.26.4)
Requirement already satisfied: scikit-learn in c:\users\lenovo\anaconda3\lib\site-packages (1.5.1)
Requirement already satisfied: matplotlib in c:\users\lenovo\anaconda3\lib\site-packages (3.9.2)
Requirement already satisfied: seaborn in c:\users\lenovo\anaconda3\lib\site-packages (0.13.2)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2024.1)
Requirement already satisfied: tzdata>=2022.7 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2023.3)
Requirement already satisfied: scipy>=1.6.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (24.1)
Requirement already satisfied: pillow>=8 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (3.1.2)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [7]:
import pandas as pd

# Load your dataset (replace the path with the correct dataset file path)
df = pd.read_csv("car_price_prediction_.csv")

# Preview the dataset
print(df.head())
   Car ID  Brand  Year  Engine Size Fuel Type Transmission  Mileage Condition  \
0       1  Tesla  2016          2.3    Petrol       Manual   114832       New   
1       2    BMW  2018          4.4  Electric       Manual   143190      Used   
2       3   Audi  2013          4.5  Electric       Manual   181601       New   
3       4  Tesla  2011          4.1    Diesel    Automatic    68682       New   
4       5   Ford  2009          2.6    Diesel       Manual   223009  Like New   

      Price     Model  
0  26613.92   Model X  
1  14679.61  5 Series  
2  44402.61        A4  
3  86374.33   Model Y  
4  73577.10   Mustang  
In [9]:
print(df.isnull().sum())
Car ID          0
Brand           0
Year            0
Engine Size     0
Fuel Type       0
Transmission    0
Mileage         0
Condition       0
Price           0
Model           0
dtype: int64
In [19]:
df.head()
Out[19]:
Car ID Brand Year Engine Size Fuel Type Transmission Mileage Condition Price Model
0 1 Tesla 2016 2.3 Petrol Manual 114832 New 26613.92 Model X
1 2 BMW 2018 4.4 Electric Manual 143190 Used 14679.61 5 Series
2 3 Audi 2013 4.5 Electric Manual 181601 New 44402.61 A4
3 4 Tesla 2011 4.1 Diesel Automatic 68682 New 86374.33 Model Y
4 5 Ford 2009 2.6 Diesel Manual 223009 Like New 73577.10 Mustang
In [21]:
df.shape
Out[21]:
(2500, 10)
In [23]:
df.isnull().sum()
Out[23]:
Car ID          0
Brand           0
Year            0
Engine Size     0
Fuel Type       0
Transmission    0
Mileage         0
Condition       0
Price           0
Model           0
dtype: int64
In [25]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2500 non-null   int64  
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB
In [27]:
df.isna().sum()
Out[27]:
Car ID          0
Brand           0
Year            0
Engine Size     0
Fuel Type       0
Transmission    0
Mileage         0
Condition       0
Price           0
Model           0
dtype: int64
In [29]:
df.describe()
Out[29]:
Car ID Year Engine Size Mileage Price
count 2500.00000 2500.0000 2500.000000 2500.000000 2500.000000
mean 1250.50000 2011.6268 3.465240 149749.844800 52638.022532
std 721.83216 6.9917 1.432053 87919.952034 27295.833455
min 1.00000 2000.0000 1.000000 15.000000 5011.270000
25% 625.75000 2005.0000 2.200000 71831.500000 28908.485000
50% 1250.50000 2012.0000 3.400000 149085.000000 53485.240000
75% 1875.25000 2018.0000 4.700000 225990.500000 75838.532500
max 2500.00000 2023.0000 6.000000 299967.000000 99982.590000
In [31]:
df.describe().T.plot(kind='bar')
Out[31]:
<Axes: >
No description has been provided for this image
In [35]:
import plotly.express as px
import pandas as pd

# Assuming df is already defined

# اختيار الأعمدة الرقمية فقط
numeric_df = df.select_dtypes(include=[float, int])

# حساب مصفوفة معامل الارتباط
correlation_matrix = numeric_df.corr()

# رسم خريطة حرارية لمصفوفة معامل الارتباط باستخدام Plotly
fig = px.imshow(correlation_matrix, text_auto=True, aspect="auto",
                title='Correlation Matrix',
                color_continuous_scale='RdBu_r')
fig.show()
In [39]:
df.columns.tolist()
Out[39]:
['Car ID',
 'Brand',
 'Year',
 'Engine Size',
 'Fuel Type',
 'Transmission',
 'Mileage',
 'Condition',
 'Price',
 'Model']
In [41]:
for col in df:
    sns.histplot(x=col,data=df,kde=True)
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [43]:
for col in df:
    if df[col].dtype == 'O':
        sns.countplot(x=col,data=df)
        plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [45]:
import plotly.express as px
import pandas as pd

# Assuming df is already defined and contains the data

# Define the columns you want to plot
columns =['Car ID',
 'Brand',
 'Year',
 'Engine Size',
 'Fuel Type',
 'Transmission',
 'Mileage',
 'Condition',
 'Price',
 'Model']
# Iterate through each column and create a bar chart or histogram
for column in columns:
    try:
        if column not in df.columns:
            print(f"Column {column} does not exist in the DataFrame")
            continue
        
        # Check if the column is suitable for a bar chart (categorical data)
        if df[column].dtype == 'object' or df[column].dtype.name == 'category':
            # Count the occurrences of each unique value
            column_counts = df[column].value_counts().reset_index()
            column_counts.columns = [column, 'count']
            
            # Create a bar chart using plotly.express
            fig = px.bar(
                column_counts,
                x=column,
                y='count',
                title=f'Distribution of {column}',
                labels={column: column, 'count': 'Count'},
                text='count'
            )
            
            # Update layout for better readability
            fig.update_layout(
                xaxis_title=column,
                yaxis_title='Count',
                paper_bgcolor='rgba(0,0,0,0)',
                plot_bgcolor='rgba(0,0,0,0)',
                title_font=dict(size=18, family="Arial"),
                xaxis={'categoryorder': 'total descending'}
            )
            
            # Show the figure
            fig.show()

        # For numerical data
        elif df[column].dtype in ['int64', 'float64']:
            # Create a histogram for numerical columns
            fig = px.histogram(
                df,
                x=column,
                title=f'Distribution of {column}',
                labels={column: column, 'count': 'Count'}
            )
            
            # Update layout for better readability
            fig.update_layout(
                xaxis_title=column,
                yaxis_title='Count',
                paper_bgcolor='rgba(0,0,0,0)',
                plot_bgcolor='rgba(0,0,0,0)',
                title_font=dict(size=18, family="Arial")
            )
            
            # Show the figure
            fig.show()
    
    except Exception as e:
        print(f"Could not create plot for column {column}: {e}")
In [47]:
df
Out[47]:
Car ID Brand Year Engine Size Fuel Type Transmission Mileage Condition Price Model
0 1 Tesla 2016 2.3 Petrol Manual 114832 New 26613.92 Model X
1 2 BMW 2018 4.4 Electric Manual 143190 Used 14679.61 5 Series
2 3 Audi 2013 4.5 Electric Manual 181601 New 44402.61 A4
3 4 Tesla 2011 4.1 Diesel Automatic 68682 New 86374.33 Model Y
4 5 Ford 2009 2.6 Diesel Manual 223009 Like New 73577.10 Mustang
... ... ... ... ... ... ... ... ... ... ...
2495 2496 Audi 2020 2.4 Petrol Automatic 22650 Like New 61384.10 Q5
2496 2497 Audi 2001 5.7 Hybrid Manual 77701 Like New 24710.35 A3
2497 2498 Ford 2021 1.1 Hybrid Manual 272827 Like New 29902.45 Fiesta
2498 2499 Audi 2002 4.5 Diesel Manual 229164 Like New 46085.67 Q5
2499 2500 Toyota 2005 4.6 Diesel Automatic 80978 Used 16594.14 RAV4

2500 rows × 10 columns

In [52]:
!pip install WordCloud
Requirement already satisfied: WordCloud in c:\users\lenovo\anaconda3\lib\site-packages (1.9.4)
Requirement already satisfied: numpy>=1.6.1 in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (1.26.4)
Requirement already satisfied: pillow in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (10.4.0)
Requirement already satisfied: matplotlib in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (24.1)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (2.9.0.post0)
Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->WordCloud) (1.16.0)
In [53]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import pandas as pd

# Assuming `dataset` is your DataFrame and `stop_words_list` is a list of stopwords

# Example stopwords list, modify as needed
stop_words_list = set(STOPWORDS)

# Generate frequency counts from the 'business_category' column
counts = Counter(df["Brand"].dropna().apply(lambda x: str(x)))

# Generate the word cloud
wcc = WordCloud(
    background_color="black",
    width=1600, height=800,
    max_words=2000,
    stopwords=stop_words_list
)
wcc.generate_from_frequencies(counts)

# Display the word cloud
plt.figure(figsize=(10, 5), facecolor='k')
plt.imshow(wcc, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
No description has been provided for this image
In [56]:
df.drop(columns = ["Car ID"],inplace = True )
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
le = LabelEncoder()
df["Brand"] = le.fit_transform(df["Brand"])
le = LabelEncoder()
df["Fuel Type"] = le.fit_transform(df["Fuel Type"])
le = LabelEncoder()
df["Transmission"] = le.fit_transform(df["Transmission"])
le = LabelEncoder()
df["Condition"] = le.fit_transform(df["Condition"])
le = LabelEncoder()
df["Model"] = le.fit_transform(df["Model"])
In [58]:
x = df.drop("Price", axis=1)
y = df["Price"]
scaler_df = StandardScaler()
x = pd.DataFrame(scaler_df. fit_transform(x),columns = x.columns)
plt.figure(figsize = (12,8))
sns.heatmap(data = df.corr(),annot = True, cmap='viridis')
plt.show()
No description has been provided for this image
In [60]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 30)
from sklearn.linear_model import LinearRegression,Lasso, Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , mean_absolute_error
dt =DecisionTreeRegressor()
dt.fit(x_train,y_train)
dt.score(x_test,y_test)*100, dt.score(x_train,y_train)*100
Out[60]:
(-103.67627129564703, 100.0)
In [62]:
mean_squared_error(y_test ,dt.predict(x_test)),mean_absolute_error(y_test ,dt.predict(x_test))
Out[62]:
(1563338739.6327448, 32135.337119999997)
In [64]:
x_test
Out[64]:
Brand Year Engine Size Fuel Type Transmission Mileage Condition Model
679 1.484284 1.340886 -0.255097 -0.426771 -1.047528 0.151204 1.206712 1.078942
1062 -1.485868 -1.234110 -0.744004 -1.311453 0.954628 -1.639795 -1.225194 1.325715
2114 0.989259 -1.663276 -1.232910 0.457911 -1.047528 0.905017 -0.009241 0.462009
524 -0.000792 0.625610 0.233809 1.342593 0.954628 -0.498392 1.206712 -1.142017
1636 -1.485868 0.196444 -0.255097 -0.426771 -1.047528 0.833574 -1.225194 1.325715
... ... ... ... ... ... ... ... ...
552 -1.485868 -1.663276 0.652872 -1.311453 -1.047528 -1.436865 -0.009241 -1.265404
490 0.494233 0.768665 -1.442442 1.342593 -1.047528 1.080041 1.206712 0.215236
1883 0.494233 -0.947999 -1.512286 1.342593 -1.047528 -0.198628 -0.009241 -0.401697
20 0.494233 1.054776 0.303653 1.342593 -1.047528 0.487565 1.206712 -1.018630
199 -1.485868 -1.663276 0.233809 -1.311453 -1.047528 1.448143 1.206712 1.202328

500 rows × 8 columns

In [66]:
dt.predict([[1.484284,1.340886,-0.255097,-0.426771,-1.047528,0.151204,1.206712,1.078942]])
C:\Users\LENOVO\anaconda3\Lib\site-packages\sklearn\base.py:493: UserWarning:

X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names

Out[66]:
array([89389.53])
In [68]:
y_test.head()
Out[68]:
679     70016.62
1062    94827.57
2114    21792.22
524     10986.59
1636    67863.46
Name: Price, dtype: float64
In [70]:
df.head()
Out[70]:
Brand Year Engine Size Fuel Type Transmission Mileage Condition Price Model
0 5 2016 2.3 3 1 114832 1 26613.92 19
1 1 2018 4.4 1 1 143190 2 14679.61 1
2 0 2013 4.5 1 1 181601 1 44402.61 3
3 5 2011 4.1 0 0 68682 1 86374.33 20
4 2 2009 2.6 0 1 223009 0 73577.10 21
In [ ]: